
# NOTES ------------------------------------------------------------------------
#   1. Sample from households with at least 12 bills
#   2. Keep bills between 28 and 34 days
#   3. Limit to 2010–2014 (PGE covers 2002–2014; SoCal covers 2010–2015)

# Description ------------------------------------------------------------------
#   1. Sample a percentage of households from each PGE and SoCal zip code.
#   2. Estimate elasticities using (1) avg. price, (2) mrg. price, and
#       (3) sim. mrg. price for (A) current prices, (B) 1-lag of price, and
#       (C) 2-lags of price.

# Setup ------------------------------------------------------------------------
  # Options
  options(stringsAsFactors = F)
  # Packages
  library(pacman)
  p_load(stringr, data.table, lfe, lubridate, foreach, doParallel, magrittr)
  # Directories
  dir_project <- "S:/Edward/Elasticity/"
  dir_csv     <- paste0(dir_project, "DataCsv/")
  dir_rds     <- paste0(dir_project, "DataR/")
  dir_pge     <- paste0(dir_rds, "Prices/PGE/")
  dir_socal   <- paste0(dir_rds, "Prices/SoCal/")
  dir_sample  <- paste0(dir_rds, "Prices/SampledSubsets/")
  dir_save    <- paste0(dir_rds, "Results/Results20170721/")
  dir_figures <- paste0(dir_project, "Figures/")
  # More options
  options(lfe.threads = 32)
  options(datatable.print.nrows = 10)
  # Set seed
  set.seed(12345)

# Define desired variables -----------------------------------------------------
  # Define variables that I want without lags
  var_nolag <- c(
    "begin_date", "end_date", "year", "ym",
    "zip", "zip_plus4", "zip9",
    "care", "care_pct",
    "hh_id", "hh_month",
    "climate", "rate_sch", "utility",
    "allowance", "rev", "pct_flags",
    NULL)
  # Define variables with which we want lags
  var_lags <- c(
    # "allowance",
    "thm", "thm_day",
    "days", "month",
    # paste0("t_p_", str_pad(1:14, 2, "left", 0)),
    "bill_hdd",
    "east_year_hdd", "ca_year_hdd",
    # "east_year_cdd", "ca_year_cdd",
    # "east_year_dd", "ca_year_dd",
    "total_bill",
    "bill_flag",
    # "care",
    "price_avg", "price_avg_mrg",
    "price_base", "price_base_ppps",
    # "price_excess", "price_excess_ppps",
    # "price_base_init", "price_excess_init",
    # "ppps_charge",
    "price_mrg", "price_mrg_ppps",
    "mrg_sim_iv_10_14", "mrg_sim_iv_11_13",
    "spot_price_1week",
    "spot_price_2week",
    "spot_price_3week",
    # "spot_price_1week_init", "spot_price_2week_init", "spot_price_3week_init",
    NULL)
  # Define lags and leads
  the_lags <- c(1:4) %>% paste0("_lag", .)
  the_leads <- c(1) %>% paste0("_lead", .)
  # Defined all desired variables (including lags and leads)
  all_vars <- c(
    var_nolag, var_lags,
    paste0(rep(var_lags, each = length(the_lags)), the_lags),
    paste0(rep(var_lags, each = length(the_leads)), the_leads))
  # Clean up a bit
  rm(var_nolag, var_lags, the_leads, the_lags)

# Function: Load data ----------------------------------------------------------
  zip_loader <- function(the_zip, the_utility, all_vars, dir_rds) {
    # Build the file name
    the_file <- paste0(dir_rds, "Prices/", the_utility, "/", "prices_",
      tolower(the_utility), "_", the_zip, ".rds")
    # Open the associated file
    bill_dt <- readRDS(the_file)
    # Keep only the desired variables (those in 'all_vars')
    bill_dt[, setdiff(names(bill_dt), all_vars) := NULL]
    # Return bill_dt
    return(bill_dt)
  }

# Function: Clean data ---------------------------------------------------------
  zip_cleaner <- function(bill_dt) {
    # Find the most extreme 1 percent of observations on therms
    thm_lo <- quantile(x = bill_dt[,thm], probs = 0.005)
    thm_hi <- quantile(x = bill_dt[, thm], probs = 0.995)
    # Find the most extreme 1 percent of observations on revenue
    rev_lo <- quantile(x = bill_dt[, rev], probs = 0.005)
    rev_hi <- quantile(x = bill_dt[, rev], probs = 0.995)
    # Flag observations outside of the central 99%
    bill_dt[, `:=`(
      flag_thm = 1 * ((thm > thm_hi) | (thm < thm_lo)),
      flag_rev = 1 * ((rev > rev_hi) | (rev < rev_lo))
      )]
# NOTE: May want to move this step to analysis file
    # Keep bills with 28-34 days
    bill_dt <- bill_dt[between(days, 28, 34)]
    # Restrict to set of households with at least 12 bills
    bill_dt[, n_bills := .N, by = hh_id]
    bill_dt <- bill_dt[n_bills >= 12]
    # Drop households if more than 50% of their bills are flagged
    bill_dt <- bill_dt[pct_flags < 0.50]
    # Return bill_dt
    return(bill_dt)
  }

# Function: Add additional variables -------------------------------------------
  zip_adder <- function(bill_dt) {
    # Spatiotemporal controls
    bill_dt[, hh_year := paste0(hh_id, "_", year)]
    bill_dt[, zip_ym := paste0(zip, "_", ym)]
    bill_dt[, zip_ym_utility := paste0(zip_ym, utility)]
    bill_dt[, zip_year := paste0(zip, "_", year)]
    bill_dt[, week := week(begin_date)]
    bill_dt[, yw := paste0(year, str_pad(week, 2, "left", 0))]
    bill_dt[, zip_yw := paste0(zip, "_", yw)]
    # Change HDD and CDD variables to thousands
    vars_hdd <- bill_dt %>% names() %>% grep("hdd", ., value = T)
    vars_cdd <- bill_dt %>% names() %>% grep("cdd", ., value = T)
    for (j in c(vars_cdd, vars_hdd)) {
      set(x = bill_dt, j = j, value = bill_dt[[j]] / 1e3)
    }
    rm(j, vars_hdd, vars_cdd); gc()
    # Return bill_dt
    return(bill_dt)
  }

# Function: Sample households and save -----------------------------------------
  zip_sampler <- function(bill_dt, the_zip, the_utility, pct, dir_sample) {
    # Find all IDs
    id_dt <- bill_dt[,"hh_id"] %>% unique()
    # Sample 'pct' percent of the IDs (rounding up to nearest integer)
    ids_sampled <- sample(
      x = id_dt[,hh_id],
      size = ceiling(nrow(id_dt) * pct))
    # Grab the subset of sampled households from bill_dt
    sample_dt <- bill_dt[hh_id %in% ids_sampled]
    # Save the sampled dataset
    saveRDS(
      object = sample_dt,
      file = paste0(dir_sample, the_utility,
        str_pad(round(100 * pct, 0), 2, "left", 0),
        "Percent/", tolower(the_utility), the_zip, ".rds"))
  }

# Function: Load and sample from a given zip code ------------------------------
  # NOTE: the_utility' should be 'PGE' or 'SoCal'
  zip_runner <- function(the_zip, the_utility, all_vars, dir_rds, pct, dir_sample) {
      zip_loader(the_zip, the_utility, all_vars, dir_rds) %>%
        zip_cleaner() %>%
        zip_adder() %>%
        zip_sampler(the_zip, the_utility, 0.05, dir_sample)
  }

# Find the individual zip codes ------------------------------------------------
  # Find PGE's zip codes (for which we have data)
  pge_zips <- dir_pge %>% dir() %>%
    str_replace("prices_pge_", "") %>%
    str_replace(".rds", "")
  # Repeat for SCG
  socal_zips <- dir_socal %>% dir() %>%
    str_replace("prices_socal_", "") %>%
    str_replace(".rds", "")

# PGE: Sample from each zip code -----------------------------------------------
  # Start cluster
  cl <- makeCluster(8)
  registerDoParallel(cl)
  # Loop over the PG&E zip codes
  foreach(the_zip = pge_zips,
    .packages = c("data.table", "magrittr", "lubridate", "stringr")) %dopar% {
      # Run the data prep and sampling functions for the zip code
      try(zip_runner(
        the_zip = the_zip,
        the_utility = "PGE",
        all_vars = all_vars,
        dir_rds = dir_rds,
        dir_sample = dir_sample))
  }
  # Kill cluster
  stopCluster(cl)
  # Clean up
  gc()

# SoCal: Sample from each zip code ---------------------------------------------
  # Start cluster
  cl <- makeCluster(8)
  registerDoParallel(cl)
  # Loop over the SoCal zip codes
  foreach(the_zip = socal_zips,
    .packages = c("data.table", "magrittr", "lubridate", "stringr")) %dopar% {
      # Run the data prep and sampling functions for the zip code
      try(zip_runner(
        the_zip = the_zip,
        the_utility = "SoCal",
        all_vars = all_vars,
        dir_rds = dir_rds,
        dir_sample = dir_sample))
  }
  # Kill cluster
  stopCluster(cl)
  # Clean up
  gc()

# PGE: Load all samples --------------------------------------------------------
  # Find all of the saved samples
  samples_pge <- dir_sample %>%
    paste0("PGE05Percent") %>% dir(full.names = T)
  # Load each of the samples and bind them together
  pge_dt <- lapply(X = samples_pge, FUN = readRDS) %>% rbindlist()
  # Save the PGE sample
  saveRDS(
    object = pge_dt,
    file = paste0(dir_sample, "pge05PctAll.rds"))
  # Drop observations outside of 2010–2014
  pge_dt <- pge_dt[between(year, 2010, 2014)]
  gc()
  # Save the PGE sample
  saveRDS(
    object = pge_dt,
    file = paste0(dir_sample, "pge05Pct2010to2014.rds"))

# SoCal: Load all samples ------------------------------------------------------
  # Find all of the saved samples
  samples_socal <- dir_sample %>%
    paste0("SoCal05Percent") %>% dir(full.names = T)
  # Load each of the samples and bind them together
  socal_dt <- lapply(X = samples_socal, FUN = readRDS) %>% rbindlist()
  # Save the SoCal sample
  saveRDS(
    object = socal_dt,
    file = paste0(dir_sample, "socal05PctAll.rds"))
  # Drop observations outside of 2010–2014
  socal_dt <- socal_dt[between(year, 2010, 2014)]
  gc()
  # Save the PGE sample
  saveRDS(
    object = socal_dt,
    file = paste0(dir_sample, "socal05Pct2010to2014.rds"))

# Build a joint dataset --------------------------------------------------------
  # Find overlapping data range
  first_date <- max(min(pge_dt[,begin_date]), min(socal_dt[,begin_date]))
  last_date <- min(max(pge_dt[,begin_date]), max(socal_dt[,begin_date]))
  # Drop observations outside date range
  pge_dt <- pge_dt[between(begin_date, first_date, last_date)]
  socal_dt <- socal_dt[between(begin_date, first_date, last_date)]
  gc()
  # Join the two utilities' data into a single dataset
  bill_dt <- rbindlist(list(pge_dt, socal_dt), use.names = T, fill = T)
  gc()
  # Drop utilities' datasets
  rm(pge_dt, socal_dt, first_date, last_date)
  gc()
  # Load the zip-city map and restrict to California; drop state
  zip_city <- fread(paste0(dir_csv, "zipCodeDatabase.csv"),
    select = c("zip", "state", "primary_city"))[state == "CA"][, state := NULL]
  # Change name of "primary_city" to "city"
  setnames(zip_city, "primary_city", "city")
  # Add the city
  bill_dt %<>% merge(y = zip_city, by = "zip", all.x = T, all.y = F, sort = F)
  # Add city-year, city-month, city-year-month, and city-year-week variables
  bill_dt[, `:=`(
    city_year  = paste0(city, year),
    city_month = paste0(city, month),
    city_ym    = paste0(city, ym),
    city_yw    = paste0(city, yw)
    )]
  # Add city-year-week-utility variable
  bill_dt[, city_yw_utility := paste0(city_yw, utility)]
  # Create seasons
  bill_dt[between(month(begin_date), 4, 9), season := "summer"]
  bill_dt[!between(month(begin_date), 4, 9), season := "winter"]
  # Find the first (integer) date in the sample
  day1 <- bill_dt[, begin_date] %>% unclass() %>% min()
  # Add variable for price cluster: a combination of billing cycle,
  # utility, and climate zone
  # NOTE: The utilities code their climate zone differently
  bill_dt[, price_cluster := paste0(
    unclass(begin_date) - day1, "_",
    unclass(end_date) - day1, "_",
    climate)]
  # Save the joint dataset
  saveRDS(
    object = bill_dt,
    file = paste0(dir_sample, "joint05Pct2010to2014.rds"))
